Generate Fig 3A
# Read in combined model training data
trainingDataMHC = readRDS("model_training_data_all_mhc.rds")
# Clean A0201 nomenclature to make consistent across models
trainingDataMHC=trainingDataMHC %>% mutate(HLA_Allele = gsub("HLA-A0201|HLA-A\\*0201|A\\*0201","HLA-A\\*02:01",HLA_Allele))
# Binary immunogenicity
trainingDataMHC=trainingDataMHC %>% mutate(Immunogenicity = ifelse(grepl("Positive",Immunogenicity),"Positive","Negative"))
# For each model find the 10 alleles with most epitopes in the training data.
ALLELES_TO_VIS=trainingDataMHC%>%group_by(HLA_Allele,Tool)%>% dplyr::summarise(n=n())%>% group_by(Tool)%>%mutate(Freq = n/sum(n))%>% filter(!HLA_Allele == "") %>% arrange(Tool, desc(n))%>% slice_max(order_by = n,n=10)%>% select(HLA_Allele,Tool)
## `summarise()` has grouped output by 'HLA_Allele'. You can override using the `.groups` argument.
# Generate a barplot of frequencies
FIGA_ALLELES_PLT=trainingDataMHC%>%group_by(Immunogenicity,HLA_Allele,Tool)%>% dplyr::summarise(n=n())%>% group_by(Tool)%>%mutate(Freq = n/sum(n))%>% arrange(Tool, desc(Freq)) %>% inner_join(ALLELES_TO_VIS)%>% mutate(HLA_Allele = gsub("HLA\\-","",HLA_Allele))%>% mutate(HLA_Allele = gsub("\\*|\\:","",HLA_Allele))%>% arrange(desc(HLA_Allele))%>%
ggbarplot(x="HLA_Allele",y="Freq",fill="Immunogenicity",position=position_dodge2())+theme_pubr(base_size = 18)+facet_wrap(~Tool,scales="free")+rotate_x_text(angle=90)+ylab("Frequency of Model Training Data")+coord_flip()
## `summarise()` has grouped output by 'Immunogenicity', 'HLA_Allele'. You can override using the `.groups` argument.
## Joining, by = c("HLA_Allele", "Tool")
Generate Fig 3B
# Label epitopes as either 'A0201+ or A0201-' and visualise the corresponding frequencies of epitopes in these grous for each model.
FIGB_A0201DOMINATION_PLT=trainingDataMHC %>% mutate(A0201_OR_NOT = ifelse(grepl("HLA-A\\*02:01|A0201|A\\*0201",HLA_Allele),"HLA-A*02:01+","HLA-A*02:01-"))%>%
group_by(Immunogenicity,A0201_OR_NOT,Tool)%>% dplyr::summarise(n=n())%>% group_by(Tool)%>%mutate(Freq = n/sum(n))%>% arrange(Tool, desc(Freq))%>% slice_max(order_by = Freq,n=10)%>% mutate(A0201_OR_NOT = gsub("HLA\\-","",A0201_OR_NOT))%>%
ggbarplot(x="A0201_OR_NOT",y="Freq",fill="Immunogenicity",position=position_dodge2())+theme_pubr(base_size = 18)+facet_wrap(~Tool,scales="free")+rotate_x_text(angle=90)+
ylab("Frequency of Model Training Data")+xlab("HLA-A*02:01 Status")+coord_flip()#+font("y.text",size=10)
## `summarise()` has grouped output by 'Immunogenicity', 'A0201_OR_NOT'. You can override using the `.groups` argument.
Generate Fig 3C
# Read in SARS-CoV-2 dataset following the benchmarking
COV2_DATA = readRDS("COV2_OTB_COMBINEDDATA.rds")
# Label A0201+ or A0201-
COV2_DATA=COV2_DATA %>% mutate(A0201_OR_NOT = ifelse(grepl("HLA-A\\*02:01|A0201|A\\*0201",HLA_Allele),"HLA-A*02:01+","HLA-A*02:01-"))
# Compare scores for each model, grouped by whether the peptide binds A0201 or not.
COHENS_D=COV2_DATA %>%group_by(Dataset) %>% cohens_d(ImmunogenicityScore~A0201_OR_NOT,ref.group = "HLA-A*02:01-")%>% mutate(effsize = paste0("cohens-d: ",round(effsize,digits=3)))
mycomparisons = list(c("+","-"))
COV2_DATA[is.na(COV2_DATA$ImmunogenicityScore), ]
## # A tibble: 0 × 7
## # … with 7 variables: Peptide <chr>, ImmunogenicityScore <dbl>,
## # HLA_Allele <chr>, ImmunogenicityCont <chr>, Immunogenicity <chr>,
## # Dataset <chr>, A0201_OR_NOT <chr>
# Simplify immunogenicity labels for visualisations
COV2_DATA=COV2_DATA %>% mutate(Immunogenicity_FULL = ifelse(Immunogenicity == 'Positive', "Pve","Nve"))
COV2_DATA=COV2_DATA %>% mutate(A201_IMM = paste0(A0201_OR_NOT,"_",Immunogenicity_FULL))
# Calcuate the median of the Nonimmunogenic A0201+ group for each model.
MEDIAN_DATA_A201_NEG = COV2_DATA %>% filter(A201_IMM == 'HLA-A*02:01+_Nve') %>% group_by(Dataset)%>% dplyr::summarise(medianIMM = median(ImmunogenicityScore))
# Calculate Cohens-d scores between A0201 status groups
COHENS_D=COV2_DATA%>% filter(A201_IMM %in% c("HLA-A*02:01+_Nve","HLA-A*02:01-_Pve"))%>% mutate(A201_IMM = gsub("HLA\\-|\\*","",A201_IMM))%>% mutate(A201_IMM = gsub("\\_","\n",A201_IMM)) %>%group_by(Dataset) %>% cohens_d(ImmunogenicityScore~A201_IMM)%>% mutate(effsize = paste0("cohens-d: ",round(effsize,digits=3)))
mycomparison = list(c("A02:01+\nNve","A02:01-\nPve"), c("A02:01+\nPve","A02:01+\nNve"))
BOXPLOTS_FIGC_PLT=COV2_DATA %>% inner_join(COHENS_D %>% select(Dataset, effsize)) %>% filter(!Dataset %in% c("netMHCpan_BA","netMHCpan_EL"))%>% mutate(A201_IMM = gsub("HLA\\-|\\*","",A201_IMM))%>% mutate(A201_IMM = gsub("\\_","\n",A201_IMM)) %>%
mutate(A201_IMM = factor(A201_IMM, levels = c("A02:01+\nPve","A02:01+\nNve","A02:01-\nPve","A02:01-\nNve")))%>%
ggplot(aes(x=(A201_IMM),y=ImmunogenicityScore,color=Immunogenicity))+geom_boxplot(notch = TRUE) + stat_summary(fun=median, geom="point", shape=19,size=5, color="green")+theme_pubr(base_size = 18)+facet_wrap(~Dataset,scales="free",nrow=2)+ geom_hline(data=MEDIAN_DATA_A201_NEG%>% filter(!Dataset %in% c("netMHCpan_BA","netMHCpan_EL")),aes(yintercept=medianIMM), linetype="dashed", color = "red", size=0.5)+xlab("Group")+rotate_x_text(angle=90)+stat_compare_means(comparisons = mycomparison,label="p.signif")#+font("x.text",size=12)+stat_pvalue_manual(COHENS_D,label="effsize",y.position = 1.0,size=3)#+scale_x_discrete(limits = rev)
## Joining, by = "Dataset"